#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#   http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

args <- commandArgs(TRUE)

library("Matrix")
library("matrixStats")

options(warn=-1)


id3_learn = function(X, y, X_subset, attributes, minsplit)
{
	#try the two base cases first

	#of the remaining samples, compute a histogram for labels
	hist_labels_helper = as.matrix(aggregate(as.vector(X_subset), by=list(as.vector(y)), FUN=sum))
	hist_labels = as.matrix(hist_labels_helper[,2])

	#go through the histogram to compute the number of labels
	#with non-zero samples
	#and to pull out the most popular label
	
	num_non_zero_labels = sum(hist_labels > 0);
	most_popular_label = which.max(t(hist_labels));
	num_remaining_attrs = sum(attributes)
	
	num_samples = sum(X_subset)
	mpl = as.numeric(most_popular_label)
	
	nodes = matrix(0, 1, 1)
	edges = matrix(0, 1, 1)
	
	#if all samples have the same label then return a leaf node
	#if no attributes remain then return a leaf node with the most popular label	
	if(num_non_zero_labels == 1 | num_remaining_attrs == 0 | num_samples < minsplit){
		nodes = matrix(0, 1, 2)
		nodes[1,1] = -1
		nodes[1,2] = most_popular_label
		edges = matrix(-1, 1, 1)
	}else{
		#computing gains for all available attributes using parfor
		hist_labels2_helper = as.matrix(aggregate(as.vector(X_subset), by=list(as.vector(y)), FUN=sum))
		hist_labels2 = as.matrix(hist_labels2_helper[,2])
		num_samples2 = sum(X_subset)
		zero_entries_in_hist1 = (hist_labels2 == 0)
		pi1 = hist_labels2/num_samples2
		log_term1 = zero_entries_in_hist1*1 + (1-zero_entries_in_hist1)*pi1
		entropy_vector1 = -pi1*log(log_term1)
		ht = sum(entropy_vector1)
		
		sz = nrow(attributes)
		gains = matrix(0, sz, 1)
		for(i in 1:nrow(attributes)){
			if(as.numeric(attributes[i,1]) == 1){
				attr_vals = X[,i]
				attr_domain_helper = as.matrix(aggregate(as.vector(X_subset), by=list(as.vector(attr_vals)), FUN=sum))
				attr_domain = as.matrix(attr_domain_helper[,2])

				hxt_vector = matrix(0, nrow(attr_domain), 1)
				
				for(j in 1:nrow(attr_domain)){
					if(as.numeric(attr_domain[j,1]) != 0){
						val = j
						Tj = X_subset * (X[,i] == val)
						
						#entropy = compute_entropy(Tj, y)
						hist_labels1_helper = as.matrix(aggregate(as.vector(Tj), by=list(as.vector(y)), FUN=sum))
						hist_labels1 = as.matrix(hist_labels1_helper[,2])
						num_samples1 = sum(Tj)
						zero_entries_in_hist = (hist_labels1 == 0)
						piv = hist_labels1/num_samples1
						log_term = zero_entries_in_hist*1 + (1-zero_entries_in_hist)*piv
						entropy_vector = -piv*log(log_term)
						entropy = sum(entropy_vector)
	
						hxt_vector[j,1] = sum(Tj)/sum(X_subset)*entropy
					}
				}
				hxt = sum(hxt_vector)
				gains[i,1] = (ht - hxt)
			}
		}
		
		#pick out attr with highest gain
		best_attr = -1
		max_gain = 0
		for(i4 in 1:nrow(gains)){
			if(as.numeric(attributes[i4,1]) == 1){
				g = as.numeric(gains[i4,1])
				if(best_attr == -1 | max_gain <= g){
					max_gain = g
					best_attr = i4
				}
			}
		}		
		
		attr_vals = X[,best_attr]
		attr_domain_helper = as.matrix(aggregate(as.vector(X_subset), by=list(as.vector(attr_vals)), FUN=sum))
		attr_domain = as.matrix(attr_domain_helper[,2])

		new_attributes = attributes
		new_attributes[best_attr, 1] = 0
		
		max_sz_subtree = 2*sum(X_subset)
		sz2 = nrow(attr_domain)
		sz1 = sz2*max_sz_subtree
		
		tempNodeStore = matrix(0, 2, sz1)
		tempEdgeStore = matrix(0, 3, sz1)
		numSubtreeNodes = matrix(0, sz2, 1)
		numSubtreeEdges = matrix(0, sz2, 1)
		
		for(i1 in 1:nrow(attr_domain)){
			
			Ti = X_subset * (X[,best_attr] == i1)
			num_nodes_Ti = sum(Ti)
			
			if(num_nodes_Ti > 0){
				tmpRet <- id3_learn(X, y, Ti, new_attributes, minsplit)
			  nodesi = as.matrix(tmpRet$a);
        edgesi = as.matrix(tmpRet$b);
      
				start_pt = 1+(i1-1)*max_sz_subtree
        tempNodeStore[,start_pt:(start_pt+nrow(nodesi)-1)] = t(nodesi)
			
        numSubtreeNodes[i1,1] = nrow(nodesi)
				if(nrow(edgesi)!=1 | ncol(edgesi)!=1 | as.numeric(edgesi[1,1])!=-1){
					tempEdgeStore[,start_pt:(start_pt+nrow(edgesi)-1)] = t(edgesi)
					numSubtreeEdges[i1,1] = nrow(edgesi)
				}else{
					numSubtreeEdges[i1,1] = 0
				}
			}
		}
		
		num_nodes_in_subtrees = sum(numSubtreeNodes)
		num_edges_in_subtrees = sum(numSubtreeEdges)
		
		#creating root node
		sz = 1+num_nodes_in_subtrees
		
		nodes = matrix(0, sz, 2)
		nodes[1,1] = best_attr
		numNodes = 1
		
		#edges from root to children
		sz = sum(numSubtreeNodes > 0) + num_edges_in_subtrees
		
		edges = matrix(1, sz, 3)
		numEdges = 0
		for(i6 in 1:nrow(attr_domain)){
			num_nodesi = as.numeric(numSubtreeNodes[i6,1])
			if(num_nodesi > 0){
				edges[numEdges+1,2] = i6
				numEdges = numEdges + 1
			}
		}
		
		nonEmptyAttri = 0
		for(i7 in 1:nrow(attr_domain)){
			numNodesInSubtree = as.numeric(numSubtreeNodes[i7,1])
		
			if(numNodesInSubtree > 0){
				start_pt1 = 1 + (i7-1)*max_sz_subtree
				nodes[(numNodes+1):(numNodes+numNodesInSubtree),] = t(tempNodeStore[,start_pt1:(start_pt1+numNodesInSubtree-1)])
			
				numEdgesInSubtree = as.numeric(numSubtreeEdges[i7,1])
			
				if(numEdgesInSubtree!=0){
					edgesi1 = t(tempEdgeStore[,start_pt1:(start_pt1+numEdgesInSubtree-1)])
					edgesi1[,1] = edgesi1[,1] + numNodes
					edgesi1[,3] = edgesi1[,3] + numNodes
          
					edges[(numEdges+1):(numEdges+numEdgesInSubtree),] = edgesi1
					numEdges = numEdges + numEdgesInSubtree
				}
			
				edges[nonEmptyAttri+1,3] = numNodes + 1
				nonEmptyAttri = nonEmptyAttri + 1
				
				numNodes = numNodes + numNodesInSubtree
			}
		}
	}
  
  return ( list(a=nodes, b=edges) );
}

X = readMM(paste(args[1], "X.mtx", sep=""));
y = readMM(paste(args[1], "y.mtx", sep=""));

n = nrow(X)
m = ncol(X)

minsplit = 2


X_subset = matrix(1, n, 1)
attributes = matrix(1, m, 1)
# recoding inputs
featureCorrections = as.vector(1 - colMins(as.matrix(X)))
onesMat = matrix(1, n, m)

X = onesMat %*% diag(featureCorrections) + X
labelCorrection = 1 - min(y)
y = y + labelCorrection + 0

tmpRet <- id3_learn(X, y, X_subset, attributes, minsplit)
nodes = as.matrix(tmpRet$a)
edges = as.matrix(tmpRet$b)

# decoding outputs
nodes[,2] = nodes[,2] - labelCorrection * (nodes[,1] == -1)
for(i3 in 1:nrow(edges)){
	e_parent = as.numeric(edges[i3,1])
	parent_feature = as.numeric(nodes[e_parent,1])
	correction = as.numeric(featureCorrections[parent_feature])
	edges[i3,2] = edges[i3,2] - correction
}

writeMM(as(nodes,"CsparseMatrix"), paste(args[2],"nodes", sep=""), format = "text")
writeMM(as(edges,"CsparseMatrix"), paste(args[2],"edges", sep=""), format = "text")

